The Ironman Triathlon is considered one of the most difficult sporting events in the world. This one-day race consists of a 3.86 km swim, a 180.25 km bicycle ride, and a marathon 42.20 km run; people completing it within the strict time cutoffs are agreed to be recognized as “Ironmen”. In this report, we have the records of many activities from an athlete who participates in the Ironman event regularly and we would like to answer some questions, the main one is:
This data is quite messy since it is a real record of sports activity and there are several sports involved and a lot of activities, next we are focusing in analysis just some statistics of the activities and create monthly time series out of it.
First, let us take a view at the dataset, and to examine some of the metrics for each feature, including the NaN
## First things first
summary(dat_clean)
## activityId uuidMsb uuidLsb name
## Length:5125 Length:5125 Length:5125 Length:5125
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## activityType userProfileId timeZoneId beginTimestamp
## Length:5125 Length:5125 Length:5125 Min. :1.325e+12
## Class :character Class :character Class :character 1st Qu.:1.413e+12
## Mode :character Mode :character Mode :character Median :1.476e+12
## Mean :1.471e+12
## 3rd Qu.:1.527e+12
## Max. :1.601e+12
## NA's :10
## eventTypeId rule sportType startTimeGmt
## Length:5125 Length:5125 Length:5125 Min. :1.325e+12
## Class :character Class :character Class :character 1st Qu.:1.414e+12
## Mode :character Mode :character Mode :character Median :1.476e+12
## Mean :1.471e+12
## 3rd Qu.:1.527e+12
## Max. :1.601e+12
##
## startTimeLocal duration distance elevationGain
## Min. :1.325e+12 Min. : 0.0129 Min. : 0.00 Min. : 0.0
## 1st Qu.:1.414e+12 1st Qu.: 37.7151 1st Qu.: 3.20 1st Qu.: 57.0
## Median :1.476e+12 Median : 60.1362 Median : 11.89 Median : 109.0
## Mean :1.471e+12 Mean : 69.5637 Mean : 19.49 Mean : 386.8
## 3rd Qu.:1.527e+12 3rd Qu.: 78.1002 3rd Qu.: 16.72 3rd Qu.: 313.9
## Max. :1.601e+12 Max. :893.7831 Max. :236.10 Max. :34046.4
## NA's :7 NA's :1119
## elevationLoss avgSpeed maxSpeed avgHr
## Min. : 0.0 Min. : 0.000 Min. : 0.00 Min. : 0.0
## 1st Qu.: 52.0 1st Qu.: 3.542 1st Qu.: 15.22 1st Qu.:134.0
## Median : 101.0 Median :12.449 Median : 25.24 Median :144.0
## Mean : 352.1 Mean :14.371 Mean : 33.07 Mean :141.2
## 3rd Qu.: 301.8 3rd Qu.:22.054 3rd Qu.: 46.01 3rd Qu.:153.0
## Max. :34046.4 Max. :47.657 Max. :6167.64 Max. :243.0
## NA's :1119 NA's :43 NA's :659 NA's :1717
## maxHr calories startLongitude startLatitude
## Min. : 0.0 Min. : 0.0 Min. :-2.911 Min. : 0.00
## 1st Qu.:156.0 1st Qu.: 481.7 1st Qu.: 2.260 1st Qu.:48.22
## Median :168.0 Median : 777.0 Median : 2.272 Median :48.81
## Mean :165.3 Mean : 892.5 Mean : 4.947 Mean :47.85
## 3rd Qu.:176.0 3rd Qu.: 1048.8 3rd Qu.: 4.346 3rd Qu.:48.89
## Max. :254.0 Max. :10028.4 Max. :19.968 Max. :55.67
## NA's :1673 NA's :53 NA's :1516 NA's :1516
## aerobicTrainingEffect avgFractionalCadence maxFractionalCadence
## Min. :0.000 Min. :0.00000 Min. :0.00000
## 1st Qu.:2.100 1st Qu.:0.00000 1st Qu.:0.00000
## Median :3.000 Median :0.00000 Median :0.00000
## Mean :2.876 Mean :0.07489 Mean :0.03327
## 3rd Qu.:3.700 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :5.000 Max. :0.99219 Max. :0.50000
## NA's :2490
## elapsedDuration movingDuration anaerobicTrainingEffect deviceId
## Min. : 0.00 Min. : 0.00 Min. :0.000 Length:5125
## 1st Qu.: 42.28 1st Qu.: 30.02 1st Qu.:0.000 Class :character
## Median : 67.13 Median : 51.19 Median :0.000 Mode :character
## Mean : 81.91 Mean : 59.61 Mean :0.298
## 3rd Qu.: 90.01 3rd Qu.: 74.16 3rd Qu.:0.200
## Max. :811.04 Max. :443.12 Max. :4.200
## NA's :2686 NA's :3273 NA's :4198
## minTemperature maxTemperature minElevation maxElevation
## Min. :-4.00 Min. : 7.00 Min. :-324.0 Min. : -46.6
## 1st Qu.:14.00 1st Qu.:24.00 1st Qu.: 28.6 1st Qu.: 98.0
## Median :19.00 Median :27.00 Median : 74.6 Median : 159.6
## Mean :18.78 Mean :26.27 Mean : 149.1 Mean : 278.4
## 3rd Qu.:24.00 3rd Qu.:29.00 3rd Qu.: 179.3 3rd Qu.: 268.8
## Max. :33.00 Max. :37.00 Max. :6892.8 Max. :11308.0
## NA's :2628 NA's :2628 NA's :2758 NA's :2758
## locationName maxVerticalSpeed lapCount endLongitude
## Length:5125 Min. : 0.000 Min. : 1.00 Min. :-0.596
## Class :character 1st Qu.: 2.880 1st Qu.: 3.00 1st Qu.: 6.472
## Mode :character Median : 4.320 Median : 12.00 Median :16.303
## Mean : 8.525 Mean : 15.78 Mean :12.532
## 3rd Qu.: 7.200 3rd Qu.: 21.00 3rd Qu.:16.352
## Max. :344.160 Max. :102.00 Max. :19.048
## NA's :3762 NA's :3454 NA's :4316
## endLatitude activeSets totalSets totalReps purposeful
## Min. :44.49 Min. :0 Min. :0 Min. :0 Mode :logical
## 1st Qu.:48.20 1st Qu.:0 1st Qu.:0 1st Qu.:0 FALSE:5125
## Median :48.22 Median :0 Median :0 Median :0
## Mean :48.28 Mean :0 Mean :0 Mean :0
## 3rd Qu.:48.24 3rd Qu.:0 3rd Qu.:0 3rd Qu.:0
## Max. :52.56 Max. :0 Max. :0 Max. :0
## NA's :4316 NA's :4293 NA's :4293 NA's :4293
## autoCalcCalories favorite pr elevationCorrected
## Mode :logical Mode :logical Mode :logical Min. :0.00
## FALSE:2360 FALSE:5124 FALSE:5117 1st Qu.:0.00
## TRUE :19 TRUE :1 TRUE :8 Median :0.00
## NA's :2746 Mean :0.00
## 3rd Qu.:0.00
## Max. :0.01
## NA's :2137
## atpActivity parent maxRunCadence steps
## Mode :logical Mode :logical Min. : 0.0 Min. : 0
## FALSE:1350 FALSE:3819 1st Qu.: 96.0 1st Qu.: 7602
## NA's :3775 TRUE :52 Median :103.0 Median : 10612
## NA's :1254 Mean :103.4 Mean : 35225
## 3rd Qu.:114.0 3rd Qu.: 12212
## Max. :193.0 Max. :9999999
## NA's :4333 NA's :4335
## avgVerticalOscillation avgGroundContactTime avgStrideLength vO2MaxValue
## Min. : 1.650 Min. :213.0 Length:5125 Min. :46.00
## 1st Qu.: 8.540 1st Qu.:242.4 Class :character 1st Qu.:57.00
## Median : 8.880 Median :250.5 Mode :character Median :58.00
## Mean : 8.758 Mean :252.2 Mean :59.16
## 3rd Qu.: 9.160 3rd Qu.:258.1 3rd Qu.:60.00
## Max. :10.280 Max. :504.8 Max. :74.00
## NA's :4531 NA's :4531 NA's :4279
## avgVerticalRatio avgGroundContactBalance avgDoubleCadence maxDoubleCadence
## Min. : 0.780 Min. :42.43 Min. : 0.0 Min. : 0.0
## 1st Qu.: 7.340 1st Qu.:49.04 1st Qu.:170.4 1st Qu.:193.0
## Median : 7.700 Median :49.48 Median :172.6 Median :207.0
## Mean : 7.729 Mean :49.43 Mean :161.7 Mean :211.3
## 3rd Qu.: 8.080 3rd Qu.:49.89 3rd Qu.:174.6 3rd Qu.:229.0
## Max. :10.550 Max. :51.47 Max. :205.2 Max. :256.0
## NA's :4531 NA's :4531 NA's :4354 NA's :4352
## avgPower avgBikeCadence maxBikeCadence strokes
## Min. : 0.0 Min. : 47.00 Min. : 90.0 Min. : 0
## 1st Qu.:224.0 1st Qu.: 86.00 1st Qu.:107.0 1st Qu.: 1343
## Median :239.0 Median : 90.00 Median :111.0 Median : 4894
## Mean :240.6 Mean : 91.16 Mean :122.1 Mean : 5546
## 3rd Qu.:261.0 3rd Qu.:100.00 3rd Qu.:125.0 3rd Qu.: 8458
## Max. :347.0 Max. :111.00 Max. :253.0 Max. :29374
## NA's :4464 NA's :3811 NA's :3821 NA's :2964
## normPower avgLeftBalance avgRightBalance max20MinPower
## Min. : 0.0 Min. :49.84 Min. :50.07 Min. :105.1
## 1st Qu.:231.0 1st Qu.:49.90 1st Qu.:50.08 1st Qu.:235.0
## Median :267.0 Median :49.91 Median :50.09 Median :268.7
## Mean :260.6 Mean :49.91 Mean :50.09 Mean :272.5
## 3rd Qu.:288.0 3rd Qu.:49.92 3rd Qu.:50.10 3rd Qu.:309.5
## Max. :365.3 Max. :49.93 Max. :50.16 Max. :389.9
## NA's :4464 NA's :4529 NA's :4529 NA's :4471
## trainingStressScore intensityFactor lactateThresholdBpm lactateThresholdSpeed
## Min. : 0.00 Min. :0.000 Min. :159.0 Min. :13.20
## 1st Qu.: 77.55 1st Qu.:0.780 1st Qu.:168.0 1st Qu.:13.80
## Median : 150.55 Median :0.956 Median :169.0 Median :14.40
## Mean : 239.78 Mean :1.046 Mean :168.4 Mean :14.24
## 3rd Qu.: 343.65 3rd Qu.:1.360 3rd Qu.:170.0 3rd Qu.:14.70
## Max. :1465.00 Max. :1.666 Max. :171.0 Max. :15.20
## NA's :4513 NA's :4513 NA's :5052 NA's :5052
## avgStrokes activeLengths avgSwolf poolLength
## Min. : 0.00 Min. : 0.00 Min. :32.00 Min. :2500
## 1st Qu.: 10.60 1st Qu.: 0.00 1st Qu.:37.00 1st Qu.:2500
## Median : 11.00 Median : 61.50 Median :39.00 Median :2500
## Mean : 43.62 Mean : 61.79 Mean :46.84 Mean :3120
## 3rd Qu.: 20.75 3rd Qu.:120.00 3rd Qu.:52.75 3rd Qu.:3300
## Max. :2221.00 Max. :178.00 Max. :85.00 Max. :5000
## NA's :4292 NA's :3795 NA's :4295 NA's :4475
## avgStrokeDistance avgSwimCadence maxSwimCadence maxFtp
## Min. :107.0 Min. : 7.00 Min. : 9.00 Min. :225.0
## 1st Qu.:219.0 1st Qu.:24.00 1st Qu.: 30.00 1st Qu.:225.0
## Median :227.0 Median :25.00 Median : 31.00 Median :225.0
## Mean :225.1 Mean :24.99 Mean : 33.54 Mean :229.9
## 3rd Qu.:235.0 3rd Qu.:27.00 3rd Qu.: 32.00 3rd Qu.:225.0
## Max. :417.0 Max. :37.00 Max. :160.00 Max. :354.0
## NA's :4619 NA's :4261 NA's :4261 NA's :5002
## workoutId decoDive parentId avgVerticalSpeed
## Length:5125 Mode :logical Length:5125 Min. :0
## Class :character FALSE:4 Class :character 1st Qu.:0
## Mode :character NA's :5121 Mode :character Median :0
## Mean :0
## 3rd Qu.:0
## Max. :0
## NA's :5120
## maxDepth avgDepth surfaceInterval floorsDescended bottomTime
## Min. :0 Min. :0 Min. :0 Min. :0 Min. :0
## 1st Qu.:0 1st Qu.:0 1st Qu.:0 1st Qu.:0 1st Qu.:0
## Median :0 Median :0 Median :0 Median :0 Median :0
## Mean :0 Mean :0 Mean :0 Mean :0 Mean :0
## 3rd Qu.:0 3rd Qu.:0 3rd Qu.:0 3rd Qu.:0 3rd Qu.:0
## Max. :0 Max. :0 Max. :0 Max. :0 Max. :0
## NA's :5124 NA's :5124 NA's :5124 NA's :5124 NA's :5124
## start_time date is_bike
## Min. :2012-01-01 15:15:45 Min. :2012-01-01 00:00:00 Mode :logical
## 1st Qu.:2014-10-17 09:19:29 1st Qu.:2014-10-17 00:00:00 FALSE:2633
## Median :2016-10-14 17:27:33 Median :2016-10-14 00:00:00 TRUE :2492
## Mean :2016-08-09 13:39:19 Mean :2016-08-08 22:59:18
## 3rd Qu.:2018-05-20 09:50:23 3rd Qu.:2018-05-20 00:00:00
## Max. :2020-09-21 17:00:51 Max. :2020-09-21 00:00:00
##
## is_run activity_recoded qual_distance qual_avgHr
## Mode :logical Length:5125 Length:5125 Length:5125
## FALSE:3640 Class :character Class :character Class :character
## TRUE :1485 Mode :character Mode :character Mode :character
##
##
##
##
From the structure above, is easy to find calories, avgSpeed, distance, and duration with less than 50 NaN, and their mean and median are relatively close, then from now, these will be the features to be studied here.
dat_clean <- select(dat_clean,
c( "activityType", "activity_recoded", "date", ## Selecting the metrics to be studied
"calories", "avgSpeed", "distance", "duration")) %>%
mutate(months = month(date),
years = year(date)) %>% ## Creating new columns for month and year
mutate(longitude = avgSpeed*duration) ## Creating the length column
Next, the plot of all these features with respect to the time for each activity recorded is presented. As we can see, in the middle of each year (from 2015 approx) we have a strong signal of the other activity class for all the metrics, this seasonality is also clear for bike from 2014, with strong pecks in calories, distance, duration and even for avgSpeed, where wide pecks appear. From the distance and duration graphs, we can see that the max points are near 200km for the bike, 50km for the run, and 5km for the swim, achieved in 400m, 200/300m, and 75m respectively from 2015.
cal_type <- ggplot(data = dat_clean, aes(x=date )) +
geom_line(aes(y = calories, color = activity_recoded)) +
facet_wrap(. ~ activity_recoded, ncol = 1, scales = 'free_y')
cal_type
avgspeed_type <- ggplot(data = dat_clean, aes(x=date )) +
geom_line(aes(y = avgSpeed, color = activity_recoded)) +
facet_wrap(. ~ activity_recoded, ncol = 1, scales = 'free_y')
avgspeed_type
dist_type <- ggplot(data = dat_clean, aes(x=date )) +
geom_line(aes(y = distance, color = activity_recoded)) +
facet_wrap(. ~ activity_recoded, ncol = 1, scales = 'free_y')
dist_type
dur_type <- ggplot(data = dat_clean, aes(x=date )) +
geom_line(aes(y = duration, color = activity_recoded)) +
facet_wrap(. ~ activity_recoded, ncol = 1, scales = 'free_y')
dur_type
To explore the metrics seasonality, we will divide the activities into four groups: bike, run, swim, and others.
## creating the series for each activity
dat_bike <- filter(dat_clean, activity_recoded =="Bike") # Bike activities
dat_run <- filter(dat_clean, activity_recoded =="Run") # Run activities
dat_swim <- filter(dat_clean, activity_recoded =="Swim") # Swim activities
dat_others <- filter(dat_clean, activity_recoded == "Other") # other activities
For now, our efforts are focused on bike activities. To aggregate the metrics, first, we will group by month and then for a year, and in each subgroup, the average will be calculated in the standard way except for the speed, where it is calculated as the mean of the longitudes divided by the mean of its duration.
test_graph <- dat_bike %>%
group_by(months, years) %>%
summarize(avg_cal = mean(calories,na.rm=T),
avg_sp = mean(longitude,na.rm=T)/mean(duration,na.rm=T),
avg_distance =mean(distance,na.rm = T),
avg_duration =mean(duration,na.rm = T),
datetime = first(date),
.groups="keep"
)
One question arises when is taken into account the activity type in this category, to see these effects another data frame was built, followed by the respective graph.
test_graph_type <- dat_bike %>% #dat_bike %>%
group_by(months, years, activityType) %>%
summarize(avg_cal = mean(calories,na.rm=T),
avg_sp = mean(longitude,na.rm=T)/mean(duration,na.rm=T),
avg_distance =mean(distance,na.rm = T),
avg_duration =mean(duration,na.rm = T),
datetime = first(date),
.groups="keep"
)
dur1 <- ggplot() + geom_line(aes(test_graph$datetime, test_graph$avg_duration), colour = 'red') +
geom_line(aes(test_graph_type$datetime, test_graph_type$avg_duration), colour = 'blue', alpha = 0.5)
dist1 <- ggplot() + geom_line(aes(test_graph$datetime, test_graph$avg_distance), colour = 'red') +
geom_line(aes(test_graph_type$datetime, test_graph_type$avg_distance), colour = 'blue', alpha = 0.5)
avgsp1 <- ggplot() + geom_line(aes(test_graph$datetime, test_graph$avg_sp), colour = 'red') +
geom_line(aes(test_graph_type$datetime, test_graph_type$avg_sp), colour = 'blue', alpha = 0.5)
cal1 <- ggplot() + geom_line(aes(test_graph$datetime, test_graph$avg_cal), colour = 'red') +
geom_line(aes(test_graph_type$datetime, test_graph_type$avg_cal), colour = 'blue', alpha = 0.5)
cowplot::plot_grid(cal1, avgsp1, dist1, dur1, labels = "AUTO")
Despite the difference observed between the mean of each metric when calculated taking into account the category type (blue line) and without it (red) we can discard this additional aggregation and use the dataframe test_graph displayed by the red lines
We built the monthly time series object for the four metrics in the bike activity and take a look at its three components: trend, seasonal, and noise from January 2012. In addition, we can see the lags autorcorrelation for each feature using the acf function.
freq = 12
ts_cal <- ts(test_graph$avg_cal, frequency = freq, start = c(2012, 1))
ts_cal_decompose <- decompose(ts_cal)
ts_sp <- ts(test_graph$avg_sp, frequency = freq, start = c(2012, 1))
ts_sp_decompose <- decompose(ts_sp)
ts_dis <- ts(test_graph$avg_distance, frequency = freq, start = c(2012, 1))
ts_dis_decompose <- decompose(ts_dis)
ts_dur <- ts(test_graph$avg_duration, frequency = freq, start = c(2012, 1))
ts_dur_decompose <- decompose(ts_dur)
First, in the calorie measurements, we see two trends, the up one is from 2012 until 2016 approx and a downtrend is observed from then. Also, there is a seasonal pattern, apparently stationary, with a peak in the middle of the year, which is in accordance with the previous observations.
In the autocorrelation plot, we have a significant correlation with the first lag and with the 8, 9, and 10 lags. Apparently, the relation between monthly calorie is extended for one and a half years, however, we can think that these lags are above the significance bounds for the chance. Therefore, we expect a model with a moving average of order q = 1.
plot(ts_cal_decompose)
acf(x = ts_cal, lag.max = 36)
In the case of average speed, we see an up and down-trend and a stationary seasonal component, very similar to the calories graph. In the correlogram, we see a significant correlation between lags until 1 year approximately, however just the first one is strong, then we expect a q =1.
plot(ts_sp_decompose)
acf(x = ts_sp, lag.max = 24)
Decomposing the distance time series, we recognize the similarities with the previous analysis, even for the autocorrelation function.
plot(ts_dis_decompose)
acf(x = ts_dis, lag.max = 24)
Finally, for the duration observations, the acf graph is similar to the one for calories, which seems a reasonable result, and again we expect a q close to 1.
plot(ts_dur_decompose)
acf(x = ts_dur, lag.max = 24)
Before using any model, we need to verify if these series are really stationary in mean and variance, for this we will use two tests: Kwiatkowski-Phillips-Schmidt-Shin and the Phillips-Perron test. A large p-value (i.e. above 0.05) will indicate a high risk if the null hypothesis is rejected.
kpss.test(ts_cal, null = 'Level') #Kwiatkowski-Phillips-Schmidt-Shin test
## Warning in kpss.test(ts_cal, null = "Level"): p-value greater than printed p-
## value
##
## KPSS Test for Level Stationarity
##
## data: ts_cal
## KPSS Level = 0.26474, Truncation lag parameter = 4, p-value = 0.1
tseries::pp.test(ts_cal)
## Warning in tseries::pp.test(ts_cal): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: ts_cal
## Dickey-Fuller Z(alpha) = -70.038, Truncation lag parameter = 4, p-value
## = 0.01
## alternative hypothesis: stationary
For the kpss test, the p-value indicates the \(H_0\) should be accepted, and we consider the calories series stationary.
For the pp test, the p-value indicates the \(H_0\) should be rejected, and consider the calories series stationary.
kpss.test(ts_sp, null = 'Level') #Kwiatkowski-Phillips-Schmidt-Shin test
##
## KPSS Test for Level Stationarity
##
## data: ts_sp
## KPSS Level = 0.4577, Truncation lag parameter = 4, p-value = 0.05228
tseries::pp.test(ts_sp)
## Warning in tseries::pp.test(ts_sp): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: ts_sp
## Dickey-Fuller Z(alpha) = -64.886, Truncation lag parameter = 4, p-value
## = 0.01
## alternative hypothesis: stationary
For the kpss test, the p-value indicates the \(H_0\) should be accepted, and we consider the AvgSpeed series stationary.
For the pp test, the p-value indicates the \(H_0\) should be rejected, and consider the AvgSpeed series stationary.
kpss.test(ts_dis, null = 'Level') #Kwiatkowski-Phillips-Schmidt-Shin test
##
## KPSS Test for Level Stationarity
##
## data: ts_dis
## KPSS Level = 0.43003, Truncation lag parameter = 4, p-value = 0.06421
tseries::pp.test(ts_dis)
## Warning in tseries::pp.test(ts_dis): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: ts_dis
## Dickey-Fuller Z(alpha) = -69.628, Truncation lag parameter = 4, p-value
## = 0.01
## alternative hypothesis: stationary
For the kpss test, the p-value indicates the \(H_0\) should be accepted, and we consider the distance series stationary.
For the pp test, the p-value indicates the \(H_0\) should be rejected, and consider the distance series stationary.
kpss.test(ts_dur, null = 'Level') #Kwiatkowski-Phillips-Schmidt-Shin test
## Warning in kpss.test(ts_dur, null = "Level"): p-value greater than printed p-
## value
##
## KPSS Test for Level Stationarity
##
## data: ts_dur
## KPSS Level = 0.32719, Truncation lag parameter = 4, p-value = 0.1
tseries::pp.test(ts_dur)
## Warning in tseries::pp.test(ts_dur): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: ts_dur
## Dickey-Fuller Z(alpha) = -72.047, Truncation lag parameter = 4, p-value
## = 0.01
## alternative hypothesis: stationary
For the kpss test, the p-value indicates the \(H_0\) should be accepted, and we consider the duration series stationary.
For the pp test, the p-value indicates the \(H_0\) should be rejected, and consider the duration series stationary.
After checking the stationarity of our series, let try the prophet and auto.arima algorithms to predict calories, avgSpeed, distance, and duration 12 months ahead.
df_ts_cal <- data.frame(y = as.matrix(ts_cal),
ds = as.Date(as.yearmon(time(ts_cal)))) #prophet accepts df with colums ds and y
prophet_model <- prophet(df = df_ts_cal, n.changepoints = 40,
daily.seasonality = FALSE,
weekly.seasonality = FALSE,
yearly.seasonality = TRUE,
holidays = NULL,
seasonality.mode = 'additive') # instancing the model
future <- make_future_dataframe(prophet_model, periods = 12, freq = 'month') # Creating the df for store current and predicted values
forecast <- predict(prophet_model, future) # Actual forecast
## Some visualization
dyplot.prophet(prophet_model, forecast)
## Warning: `select_()` is deprecated as of dplyr 0.7.0.
## Please use `select()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
prophet_plot_components(prophet_model, forecast)
tail(forecast[c('ds', 'yhat')], 12)
## ds yhat
## 106 2020-10-01 1112.0596
## 107 2020-11-01 1077.3970
## 108 2020-12-01 1009.1717
## 109 2021-01-01 1144.5503
## 110 2021-02-01 857.5232
## 111 2021-03-01 969.4464
## 112 2021-04-01 926.7622
## 113 2021-05-01 996.6521
## 114 2021-06-01 977.6044
## 115 2021-07-01 1056.2431
## 116 2021-08-01 1007.0368
## 117 2021-09-01 1228.8467
Using the prophet algorithm and plotting the components we expect a little break around February 2021 and a maximum caloric expenditure happens between July and September. In September we expect the highest value around 1228.85.
The best model is selected according to their AIC value. Since we have a monthly seasonality, we will use D = 1 as one of the parameters. This choosing is also supported by the low AIC obtained when we use this parameter instead of D = 0 in the Auto.Arima function:
arima_model <- auto.arima(ts_cal,
D = 1,
approximation = FALSE,
seasonal = TRUE,
allowmean = FALSE,
allowdrift = FALSE,
trace = TRUE)
##
## ARIMA(2,0,2)(1,1,1)[12] : Inf
## ARIMA(0,0,0)(0,1,0)[12] : 1469.818
## ARIMA(1,0,0)(1,1,0)[12] : 1443.049
## ARIMA(0,0,1)(0,1,1)[12] : Inf
## ARIMA(1,0,0)(0,1,0)[12] : 1464.003
## ARIMA(1,0,0)(2,1,0)[12] : 1430.675
## ARIMA(1,0,0)(2,1,1)[12] : Inf
## ARIMA(1,0,0)(1,1,1)[12] : Inf
## ARIMA(0,0,0)(2,1,0)[12] : 1439.977
## ARIMA(2,0,0)(2,1,0)[12] : 1432.263
## ARIMA(1,0,1)(2,1,0)[12] : 1432.315
## ARIMA(0,0,1)(2,1,0)[12] : 1430.566
## ARIMA(0,0,1)(1,1,0)[12] : 1440.495
## ARIMA(0,0,1)(2,1,1)[12] : Inf
## ARIMA(0,0,1)(1,1,1)[12] : Inf
## ARIMA(0,0,2)(2,1,0)[12] : 1432.158
## ARIMA(1,0,2)(2,1,0)[12] : 1431.507
##
## Best model: ARIMA(0,0,1)(2,1,0)[12]
The best model for our calories time series is the Arima(p = 0, d = 0, q = 1)(P = 2, D = 1, Q = 0), Next, lets take a look at the partial autocorrelation function to check the order of the autoregressive term.
pacf(ts_cal)
From this plot, we see a low correlation with the first lag, which is in accordance with the model selected using AutoArima. Now we look at the residuals plot:
checkresiduals(arima_model)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,1)(2,1,0)[12]
## Q* = 50.693, df = 18, p-value = 5.926e-05
##
## Model df: 3. Total lags used: 21
The residuals do not look like white noise since their mean and standard deviation not seems stationary, in the ACF plot we can see an apparent correlation at lag 9, and in the distribution plot, we cannot see normal distribution. Finally, the Ljung-Box test, which examines autocorrelation of the residuals, indicates that the model did not capture all the information, and we have some lack in the fit.
Being aware of this, let us do a prediction for calories:
fcast_arima <- forecast(arima_model, h = 12)
fcast_arima$mean
## Jan Feb Mar Apr May Jun Jul
## 2020
## 2021 1170.9341 695.1321 656.9806 951.0325 924.9077 695.4906 813.9202
## Aug Sep Oct Nov Dec
## 2020 911.8660 1082.7614 1178.8073
## 2021 1137.0221 1420.8499
plot(fcast_arima)
We expect a spent in calories around 911.8660 by October and 1082.7614 in November approximately. Also, we have a high value in September (1420.85) which is in accordance with the prophet predictions.
df_ts <- data.frame(y = as.matrix(ts_sp),
ds = as.Date(as.yearmon(time(ts_sp)))) #prophet accepts df with colums ds and y
prophet_model <- prophet(df = df_ts, n.changepoints = 40,
daily.seasonality = FALSE,
weekly.seasonality = FALSE,
yearly.seasonality = TRUE,
holidays = NULL,
seasonality.mode = 'additive') # instancing the model
future <- make_future_dataframe(prophet_model, periods = 12, freq = 'month') # Creating the df for store current and predicted values
forecast <- predict(prophet_model, future) # Actual forecast
tail(forecast[c('ds', 'yhat')], 12)
## ds yhat
## 106 2020-10-01 14.63863
## 107 2020-11-01 15.78994
## 108 2020-12-01 16.79719
## 109 2021-01-01 17.84012
## 110 2021-02-01 15.99719
## 111 2021-03-01 14.32454
## 112 2021-04-01 12.06729
## 113 2021-05-01 17.08753
## 114 2021-06-01 13.85389
## 115 2021-07-01 14.12790
## 116 2021-08-01 13.25529
## 117 2021-09-01 14.95885
## Some graphics
dyplot.prophet(prophet_model, forecast)
prophet_plot_components(prophet_model, forecast)
We have a strong signal between Feb and April, and a high average speed in July as before.
arima_model <- auto.arima(ts_sp,
D = 1,
approximation = FALSE,
seasonal = TRUE,
allowmean = FALSE,
allowdrift = FALSE,
trace = TRUE)
##
## ARIMA(2,1,2)(1,1,1)[12] : 659.89
## ARIMA(0,1,0)(0,1,0)[12] : 729.5152
## ARIMA(1,1,0)(1,1,0)[12] : 676.9559
## ARIMA(0,1,1)(0,1,1)[12] : 653.5562
## ARIMA(0,1,1)(0,1,0)[12] : 675.8381
## ARIMA(0,1,1)(1,1,1)[12] : 655.707
## ARIMA(0,1,1)(0,1,2)[12] : 655.704
## ARIMA(0,1,1)(1,1,0)[12] : 662.6769
## ARIMA(0,1,1)(1,1,2)[12] : Inf
## ARIMA(0,1,0)(0,1,1)[12] : 701.4226
## ARIMA(1,1,1)(0,1,1)[12] : 654.3206
## ARIMA(0,1,2)(0,1,1)[12] : 654.4844
## ARIMA(1,1,0)(0,1,1)[12] : 668.0797
## ARIMA(1,1,2)(0,1,1)[12] : 655.493
##
## Best model: ARIMA(0,1,1)(0,1,1)[12]
#summary(arima_model)
checkresiduals(arima_model)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,1)(0,1,1)[12]
## Q* = 21.898, df = 19, p-value = 0.2894
##
## Model df: 2. Total lags used: 21
fcast_arima <- forecast(arima_model, h = 12)
fcast_arima$mean
## Jan Feb Mar Apr May Jun Jul
## 2020
## 2021 12.575058 8.233577 9.181427 5.259853 12.283333 9.144843 6.317709
## Aug Sep Oct Nov Dec
## 2020 8.141401 11.086128 12.121597
## 2021 7.664674 12.821173
plot(fcast_arima)
In this case, the best model is the Arima(p = 0, d = 1, q = 1)(P = 0, D = 1, Q = 1). This tells us that was necessary to do 1 differentiation in order to make the series stationary in both trend and seasonality, and we have a moving average the order 1. The residuals do not have correlations and are basically white noise, and we expect for the next 2 months and an average of 8.14 and 11.09m/s, a maximum value is reached in September.
df_ts <- data.frame(y = as.matrix(ts_dis),
ds = as.Date(as.yearmon(time(ts_dis)))) #prophet accepts df with colums ds and y
prophet_model <- prophet(df = df_ts, n.changepoints = 40,
daily.seasonality = FALSE,
weekly.seasonality = FALSE,
yearly.seasonality = TRUE,
holidays = NULL,
seasonality.mode = 'additive') # instancing the model
future <- make_future_dataframe(prophet_model, periods = 12, freq = 'month') # Creating the df for store current and predicted values
forecast <- predict(prophet_model, future) # Actual forecast
tail(forecast[c('ds', 'yhat')], 12)
## ds yhat
## 106 2020-10-01 29.85411
## 107 2020-11-01 35.89716
## 108 2020-12-01 32.25534
## 109 2021-01-01 37.05529
## 110 2021-02-01 26.17171
## 111 2021-03-01 28.64615
## 112 2021-04-01 27.04608
## 113 2021-05-01 32.12230
## 114 2021-06-01 30.77406
## 115 2021-07-01 36.51051
## 116 2021-08-01 31.01988
## 117 2021-09-01 37.10404
## Some graphics
dyplot.prophet(prophet_model, forecast)
prophet_plot_components(prophet_model, forecast)
For distance, we have, again, a high signal in July (31.02) and September (37.10).
arima_model <- auto.arima(ts_dis,
D = 1,
approximation = FALSE,
seasonal = TRUE,
allowmean = FALSE,
allowdrift = FALSE,
trace = TRUE)
##
## ARIMA(2,1,2)(1,1,1)[12] : 836.8524
## ARIMA(0,1,0)(0,1,0)[12] : 909.2821
## ARIMA(1,1,0)(1,1,0)[12] : 865.6824
## ARIMA(0,1,1)(0,1,1)[12] : Inf
## ARIMA(2,1,2)(0,1,1)[12] : Inf
## ARIMA(2,1,2)(1,1,0)[12] : 844.9217
## ARIMA(2,1,2)(2,1,1)[12] : 839.0323
## ARIMA(2,1,2)(1,1,2)[12] : 839.1997
## ARIMA(2,1,2)(0,1,0)[12] : 872.3796
## ARIMA(2,1,2)(0,1,2)[12] : 836.8102
## ARIMA(1,1,2)(0,1,2)[12] : 834.9287
## ARIMA(1,1,2)(0,1,1)[12] : Inf
## ARIMA(1,1,2)(1,1,2)[12] : Inf
## ARIMA(1,1,2)(1,1,1)[12] : 834.9413
## ARIMA(0,1,2)(0,1,2)[12] : 832.6516
## ARIMA(0,1,2)(0,1,1)[12] : Inf
## ARIMA(0,1,2)(1,1,2)[12] : 834.942
## ARIMA(0,1,2)(1,1,1)[12] : 832.6622
## ARIMA(0,1,1)(0,1,2)[12] : 830.7035
## ARIMA(0,1,1)(1,1,2)[12] : 832.9412
## ARIMA(0,1,1)(1,1,1)[12] : 830.7143
## ARIMA(0,1,0)(0,1,2)[12] : 868.2806
## ARIMA(1,1,1)(0,1,2)[12] : 832.6394
## ARIMA(1,1,0)(0,1,2)[12] : 848.9428
##
## Best model: ARIMA(0,1,1)(0,1,2)[12]
#summary(arima_model)
checkresiduals(arima_model)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,1)(0,1,2)[12]
## Q* = 32.789, df = 18, p-value = 0.0177
##
## Model df: 3. Total lags used: 21
fcast_arima <- forecast(arima_model, h = 12)
fcast_arima$mean
## Jan Feb Mar Apr May Jun Jul
## 2020
## 2021 18.800320 7.225705 7.428023 8.688751 11.891255 11.930237 13.901551
## Aug Sep Oct Nov Dec
## 2020 5.580981 16.538572 12.281216
## 2021 12.526574 17.776611
plot(fcast_arima)
The best model is the Arima(p = 0, d = 1, q = 1)(P = 0, D = 1, Q = 2). We expect 5.58 and 16km in October and November, a high distance is expected in September.
df_ts <- data.frame(y = as.matrix(ts_dur),
ds = as.Date(as.yearmon(time(ts_dur)))) #prophet accepts df with colums ds and y
prophet_model <- prophet(df = df_ts, n.changepoints = 2,
daily.seasonality = FALSE,
weekly.seasonality = FALSE,
yearly.seasonality = TRUE,
holidays = NULL,
seasonality.mode = 'additive') # instancing the model
## Warning in .local(object, ...): non-zero return code in optimizing
## Optimization terminated abnormally. Falling back to Newton optimizer.
future <- make_future_dataframe(prophet_model, periods = 12, freq = 'month') # Creating the df for store current and predicted values
forecast <- predict(prophet_model, future) # Actual forecast
## Some graphics
dyplot.prophet(prophet_model, forecast)
prophet_plot_components(prophet_model, forecast)
tail(forecast[c('ds', 'yhat')], 12)
## ds yhat
## 106 2020-10-01 100.05595
## 107 2020-11-01 92.89806
## 108 2020-12-01 80.97163
## 109 2021-01-01 70.55495
## 110 2021-02-01 64.52417
## 111 2021-03-01 78.70355
## 112 2021-04-01 76.11203
## 113 2021-05-01 78.34668
## 114 2021-06-01 80.40239
## 115 2021-07-01 92.14649
## 116 2021-08-01 75.27488
## 117 2021-09-01 92.26922
We have a strong signal from July until September approximately.
arima_model <- auto.arima(ts_dur,
D = 1,
approximation = FALSE,
seasonal = TRUE,
allowmean = FALSE,
allowdrift = FALSE,
trace = TRUE)
##
## ARIMA(2,0,2)(1,1,1)[12] : Inf
## ARIMA(0,0,0)(0,1,0)[12] : 974.3359
## ARIMA(1,0,0)(1,1,0)[12] : 947.255
## ARIMA(0,0,1)(0,1,1)[12] : Inf
## ARIMA(1,0,0)(0,1,0)[12] : 972.2583
## ARIMA(1,0,0)(2,1,0)[12] : 938.0897
## ARIMA(1,0,0)(2,1,1)[12] : Inf
## ARIMA(1,0,0)(1,1,1)[12] : Inf
## ARIMA(0,0,0)(2,1,0)[12] : 945.7754
## ARIMA(2,0,0)(2,1,0)[12] : Inf
## ARIMA(1,0,1)(2,1,0)[12] : 940.3248
## ARIMA(0,0,1)(2,1,0)[12] : 938.8287
## ARIMA(2,0,1)(2,1,0)[12] : Inf
##
## Best model: ARIMA(1,0,0)(2,1,0)[12]
#summary(arima_model)
checkresiduals(arima_model)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,0)(2,1,0)[12]
## Q* = 53.533, df = 18, p-value = 2.169e-05
##
## Model df: 3. Total lags used: 21
fcast_arima <- forecast(arima_model, h = 12)
fcast_arima$mean
## Jan Feb Mar Apr May Jun Jul Aug
## 2020
## 2021 96.77785 53.61714 55.27590 79.21519 72.34404 67.30008 70.50087 98.62906
## Sep Oct Nov Dec
## 2020 66.78486 87.99443 85.12786
## 2021 97.44105
plot(fcast_arima)
The best model is the Arima(p = 1, d = 0, q = 0)(P = 2, D = 1, Q = 0). We expect 66.77 and 87.98min for Oct and Nov respectively and 98.63 and 97.44min for Aug and Sep, slightly different from prophet results (81.05 and 90.25min)
test_graph <- dat_run %>%
group_by(months, years) %>%
summarize(avg_duration =mean(duration,na.rm = T),
datetime = first(date),
.groups="keep"
)
freq = 12
ts_dur <- ts(test_graph$avg_duration, frequency = freq, start = c(2012, 1))
df_ts <- data.frame(y = as.matrix(ts_dur),
ds = as.Date(as.yearmon(time(ts_dur)))) #prophet accepts df with colums ds and y
prophet_model <- prophet(df = df_ts, n.changepoints = 2,
daily.seasonality = FALSE,
weekly.seasonality = FALSE,
yearly.seasonality = TRUE,
holidays = NULL,
seasonality.mode = 'additive') # instancing the model
future <- make_future_dataframe(prophet_model, periods = 12, freq = 'month') # Creating the df for store current and predicted values
forecast <- predict(prophet_model, future) # Actual forecast
## Cross validation
df.cv <- cross_validation(prophet_model, initial = 365 * 3, horizon = 365 * 1, units = "days")
## Making 10 forecasts with cutoffs between 2015-03-04 12:00:00 and 2019-09-02
t.test(df.cv$y, df.cv$yhat)
##
## Welch Two Sample t-test
##
## data: df.cv$y and df.cv$yhat
## t = 3.0075, df = 194.44, p-value = 0.002982
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.201953 5.781359
## sample estimates:
## mean of x mean of y
## 61.49892 58.00727
plot(df.cv$y, df.cv$yhat,
xlab = "Actual Values",
ylab = "Predicted Values",
ylim=c(20, 90),
xlim=c(20,90))
abline(a = 0, b = 1)
tail(forecast[c('ds', 'yhat')], 12)
## ds yhat
## 106 2020-10-01 69.20994
## 107 2020-11-01 64.52222
## 108 2020-12-01 56.48211
## 109 2021-01-01 66.99431
## 110 2021-02-01 62.70180
## 111 2021-03-01 63.39112
## 112 2021-04-01 56.48356
## 113 2021-05-01 59.74728
## 114 2021-06-01 59.44320
## 115 2021-07-01 68.13513
## 116 2021-08-01 55.38029
## 117 2021-09-01 66.39578
We expect a highest value in July, around 67min.
arima_model <- auto.arima(ts_dur,
D = 1,
approximation = FALSE,
seasonal = TRUE,
allowmean = FALSE,
allowdrift = FALSE,
trace = FALSE)
checkresiduals(arima_model)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,0)(2,1,0)[12]
## Q* = 48.937, df = 19, p-value = 0.0001876
##
## Model df: 2. Total lags used: 21
fcast_arima <- forecast(arima_model, h = 12)
fcast_arima$mean
## Jan Feb Mar Apr May Jun Jul Aug
## 2020
## 2021 62.49512 63.30861 59.70088 60.94988 62.08313 68.72334 62.90546 62.19069
## Sep Oct Nov Dec
## 2020 61.64133 64.33292 61.56281
## 2021 73.13248
being aware of the correlation among the residuals we do the prediction and obtain the maximum duration for swim in September (73.13min).
test_graph <- dat_swim %>%
group_by(months, years) %>%
summarize(avg_duration =mean(duration,na.rm = T),
datetime = first(date),
.groups="keep"
)
freq = 12
ts_dur <- ts(test_graph$avg_duration, frequency = freq, start = c(2012, 1))
df_ts <- data.frame(y = as.matrix(ts_dur),
ds = as.Date(as.yearmon(time(ts_dur)))) #prophet accepts df with colums ds and y
prophet_model <- prophet(df = df_ts, n.changepoints = 2,
daily.seasonality = FALSE,
weekly.seasonality = FALSE,
yearly.seasonality = TRUE,
holidays = NULL,
seasonality.mode = 'additive') # instancing the model
future <- make_future_dataframe(prophet_model, periods = 12, freq = 'month') # Creating the df for store current and predicted values
forecast <- predict(prophet_model, future) # Actual forecast
## Cross validation
df.cv <- cross_validation(prophet_model, initial = 365 * 3, horizon = 365 * 1, units = "days")
## Making 9 forecasts with cutoffs between 2015-06-03 and 2019-06-02
t.test(df.cv$y, df.cv$yhat)
##
## Welch Two Sample t-test
##
## data: df.cv$y and df.cv$yhat
## t = 1.194, df = 163.38, p-value = 0.2342
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.054193 4.278961
## sample estimates:
## mean of x mean of y
## 51.27626 49.66387
plot(df.cv$y, df.cv$yhat,
xlab = "Actual Values",
ylab = "Predicted Values",
ylim=c(20, 90),
xlim=c(20,90) )
abline(a = 0, b = 1)
tail(forecast[c('ds', 'yhat')], 12)
## ds yhat
## 103 2020-07-01 54.27265
## 104 2020-08-01 35.47524
## 105 2020-09-01 46.89626
## 106 2020-10-01 60.53629
## 107 2020-11-01 40.78029
## 108 2020-12-01 43.39367
## 109 2021-01-01 53.37159
## 110 2021-02-01 42.05829
## 111 2021-03-01 50.88724
## 112 2021-04-01 48.31481
## 113 2021-05-01 52.07390
## 114 2021-06-01 49.52420
arima_model <- auto.arima(ts_dur,
D = 1,
approximation = FALSE,
seasonal = TRUE,
allowmean = FALSE,
allowdrift = FALSE,
trace = FALSE)
checkresiduals(arima_model)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,1)(2,1,0)[12]
## Q* = 22.545, df = 17, p-value = 0.1647
##
## Model df: 3. Total lags used: 20
fcast_arima <- forecast(arima_model, h = 12)
fcast_arima$mean
## Jan Feb Mar Apr May Jun Jul Aug
## 2020 50.62644 52.97139
## 2021 48.97092 54.04121 52.26377 56.42058 58.84847 49.01564
## Sep Oct Nov Dec
## 2020 51.89501 45.17261 45.24033 40.18275
## 2021
We have not correlated residuals!
best.algo <- getBestModel(df_ts_cal$ds, df_ts_cal$y, "month",graph = T)
## Warning: Removed 744 row(s) containing missing values (geom_path).
best.algo
## $prepedTS
## $prepedTS$obj.ts
## Jan Feb Mar Apr May Jun Jul
## 2012 2020.3952 895.5545 991.8897 808.9866 815.5238 533.3956 664.6867
## 2013 771.6792 778.0633 492.7963 802.1261 459.7985 1132.2267 1222.5675
## 2014 939.2726 730.2006 1209.9384 699.4937 980.6657 1055.5545 1376.1652
## 2015 737.0406 1132.8125 1029.6493 1083.0193 975.0903 1046.1166 1261.8129
## 2016 1477.2945 1206.0229 878.9491 1079.5767 2514.4361 1874.6408 1332.3304
## 2017 1392.7942 1294.3597 2448.9031 1363.4417 670.8891 1355.5210 1595.3492
## 2018 1307.1542 749.6017 454.7458 1098.2015 596.1319 530.4478 973.4820
## 2019 834.2871 623.0444 580.0334 1065.6024 1520.2954 1066.6428 813.1772
## 2020 1324.7857 699.4503 956.2022 679.6982 752.2204 542.6843 634.0580
## Aug Sep Oct Nov Dec
## 2012 502.0998 1036.7395 2227.9517 823.9324 862.0300
## 2013 1000.1547 1175.3170 1564.6909 803.2494 599.8794
## 2014 1010.3592 896.0374 1404.3788 1724.7412 1446.3113
## 2015 2216.9220 2000.0314 988.1244 1276.2906 727.5633
## 2016 544.7897 1137.1096 598.9633 1109.6674 1300.9744
## 2017 824.3859 974.3209 1040.4949 1609.3622 1814.3126
## 2018 1587.8528 1889.7150 608.7954 901.7124 620.0805
## 2019 1166.4349 773.3611 560.8099 652.5529 970.8949
## 2020 600.0049 1482.6928
##
## $prepedTS$obj.df
## dates val
## 1 2012-01-01 2020.3952
## 2 2012-02-01 895.5545
## 3 2012-03-01 991.8897
## 4 2012-04-01 808.9866
## 5 2012-05-01 815.5238
## 6 2012-06-01 533.3956
## 7 2012-07-01 664.6867
## 8 2012-08-01 502.0998
## 9 2012-09-01 1036.7395
## 10 2012-10-01 2227.9517
## 11 2012-11-01 823.9324
## 12 2012-12-01 862.0300
## 13 2013-01-01 771.6792
## 14 2013-02-01 778.0633
## 15 2013-03-01 492.7963
## 16 2013-04-01 802.1261
## 17 2013-05-01 459.7985
## 18 2013-06-01 1132.2267
## 19 2013-07-01 1222.5675
## 20 2013-08-01 1000.1547
## 21 2013-09-01 1175.3170
## 22 2013-10-01 1564.6909
## 23 2013-11-01 803.2494
## 24 2013-12-01 599.8794
## 25 2014-01-01 939.2726
## 26 2014-02-01 730.2006
## 27 2014-03-01 1209.9384
## 28 2014-04-01 699.4937
## 29 2014-05-01 980.6657
## 30 2014-06-01 1055.5545
## 31 2014-07-01 1376.1652
## 32 2014-08-01 1010.3592
## 33 2014-09-01 896.0374
## 34 2014-10-01 1404.3788
## 35 2014-11-01 1724.7412
## 36 2014-12-01 1446.3113
## 37 2015-01-01 737.0406
## 38 2015-02-01 1132.8125
## 39 2015-03-01 1029.6493
## 40 2015-04-01 1083.0193
## 41 2015-05-01 975.0903
## 42 2015-06-01 1046.1166
## 43 2015-07-01 1261.8129
## 44 2015-08-01 2216.9220
## 45 2015-09-01 2000.0314
## 46 2015-10-01 988.1244
## 47 2015-11-01 1276.2906
## 48 2015-12-01 727.5633
## 49 2016-01-01 1477.2945
## 50 2016-02-01 1206.0229
## 51 2016-03-01 878.9491
## 52 2016-04-01 1079.5767
## 53 2016-05-01 2514.4361
## 54 2016-06-01 1874.6408
## 55 2016-07-01 1332.3304
## 56 2016-08-01 544.7897
## 57 2016-09-01 1137.1096
## 58 2016-10-01 598.9633
## 59 2016-11-01 1109.6674
## 60 2016-12-01 1300.9744
## 61 2017-01-01 1392.7942
## 62 2017-02-01 1294.3597
## 63 2017-03-01 2448.9031
## 64 2017-04-01 1363.4417
## 65 2017-05-01 670.8891
## 66 2017-06-01 1355.5210
## 67 2017-07-01 1595.3492
## 68 2017-08-01 824.3859
## 69 2017-09-01 974.3209
## 70 2017-10-01 1040.4949
## 71 2017-11-01 1609.3622
## 72 2017-12-01 1814.3126
## 73 2018-01-01 1307.1542
## 74 2018-02-01 749.6017
## 75 2018-03-01 454.7458
## 76 2018-04-01 1098.2015
## 77 2018-05-01 596.1319
## 78 2018-06-01 530.4478
## 79 2018-07-01 973.4820
## 80 2018-08-01 1587.8528
## 81 2018-09-01 1889.7150
## 82 2018-10-01 608.7954
## 83 2018-11-01 901.7124
## 84 2018-12-01 620.0805
## 85 2019-01-01 834.2871
## 86 2019-02-01 623.0444
## 87 2019-03-01 580.0334
## 88 2019-04-01 1065.6024
## 89 2019-05-01 1520.2954
## 90 2019-06-01 1066.6428
## 91 2019-07-01 813.1772
## 92 2019-08-01 1166.4349
## 93 2019-09-01 773.3611
## 94 2019-10-01 560.8099
## 95 2019-11-01 652.5529
## 96 2019-12-01 970.8949
## 97 2020-01-01 1324.7857
## 98 2020-02-01 699.4503
## 99 2020-03-01 956.2022
## 100 2020-04-01 679.6982
## 101 2020-05-01 752.2204
## 102 2020-06-01 542.6843
## 103 2020-07-01 634.0580
## 104 2020-08-01 600.0049
## 105 2020-09-01 1482.6928
##
## $prepedTS$freq.num
## [1] 12
##
## $prepedTS$freq.alpha
## [1] "month"
##
##
## $best
## [1] "my.bagged"
##
## $train.errors
## # A tibble: 1 x 8
## prophet ets sarima tbats bats stlm shortterm bagged
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 417. 353. 388. 293. 293. 322. 490. 293.
##
## $res.train
## # A tibble: 129 x 11
## dates type prophet ets sarima tbats bats stlm shortterm bagged
## <date> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2012-01-01 <NA> NA NA NA NA NA NA NA NA
## 2 2012-02-01 <NA> NA NA NA NA NA NA NA NA
## 3 2012-03-01 <NA> NA NA NA NA NA NA NA NA
## 4 2012-04-01 <NA> NA NA NA NA NA NA NA NA
## 5 2012-05-01 <NA> NA NA NA NA NA NA NA NA
## 6 2012-06-01 <NA> NA NA NA NA NA NA NA NA
## 7 2012-07-01 <NA> NA NA NA NA NA NA NA NA
## 8 2012-08-01 <NA> NA NA NA NA NA NA NA NA
## 9 2012-09-01 <NA> NA NA NA NA NA NA NA NA
## 10 2012-10-01 <NA> NA NA NA NA NA NA NA NA
## # ... with 119 more rows, and 1 more variable: actual.value <dbl>
##
## $algos
## $algos[[1]]
## [1] "my.prophet"
##
## $algos[[2]]
## [1] "my.ets"
##
## $algos[[3]]
## [1] "my.sarima"
##
## $algos[[4]]
## [1] "my.tbats"
##
## $algos[[5]]
## [1] "my.bats"
##
## $algos[[6]]
## [1] "my.stlm"
##
## $algos[[7]]
## [1] "my.shortterm"
##
##
## $graph.train
## Warning: Removed 744 row(s) containing missing values (geom_path).
#auto_ts <- prepare.ts(test_graph$datetime, test_graph$avg_sp, freq = 'month') #week month quarter day
#auto_ts
#plot.ts(auto_ts$obj.ts)
Time = attributes(ts_cal)[[1]]
Time = seq(Time[1],Time[2], length.out=(Time[2]-Time[1])*Time[3])
dat = cbind(Time, with(ts_cal_decompose, data.frame(Observed=x, Trend=trend, Seasonal=seasonal, Random=random)))
ggplot(gather(dat, component, value, -Time), aes(Time, value)) +
facet_grid(component ~ ., scales="free_y") +
geom_line() +
theme_bw() +
labs(y="", x="Year") +
ggtitle(expression(Decomposed~Calories~Time~Series~of~Bike~Activity)) +
theme(plot.title=element_text(hjust=0.5))
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
## Warning: Removed 6 row(s) containing missing values (geom_path).